/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/contrib/arm_compute_lib/acl_runtime.cc
 * \brief A simple JSON runtime for Arm Compute Library.
 */

#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/tensor.h>

#include "../json/json_node.h"
#include "../json/json_runtime.h"

#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
#include <arm_compute/runtime/NEON/functions/NEConcatenateLayer.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
#include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>

#include "acl_allocator.h"
#include "acl_utils.h"
#endif

namespace tvm {
namespace runtime {
namespace contrib {

using namespace tvm::runtime::json;

class ACLRuntime : public JSONRuntimeBase {
 public:
  /*!
   * \brief Construct the ACL runtime module.
   *
   * The three arguments are forwarded unchanged to JSONRuntimeBase; the constructor
   * does no other work. The ACL layer itself is built later, from Init().
   *
   * \param symbol_name The name of the function.
   * \param graph_json Serialized JSON representation of a sub-graph.
   * \param const_names The names of each constant in the sub-graph.
   */
  explicit ACLRuntime(const std::string& symbol_name, const std::string& graph_json,
                      const ffi::Array<ffi::String>& const_names)
      : JSONRuntimeBase(symbol_name, graph_json, const_names) {
  }

  /*!
   * \brief Report the kind string that identifies this module.
   *
   * \return The constant string "arm_compute_lib".
   */
  const char* kind() const override {
    return "arm_compute_lib";
  }

  /*!
   * \brief Initialize the runtime: bind the constants, then create the ACL layer
   * from the JSON representation.
   *
   * \param consts The constant params from compiled model. Their count must equal
   * the number of constant entries recorded in const_idx_.
   */
  void Init(const ffi::Array<Tensor>& consts) override {
    ICHECK_EQ(consts.size(), const_idx_.size())
        << "The number of input constants must match the number of required.";
    this->SetupConstants(consts);
    this->BuildEngine();
  }

#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB
  /*!
   * \brief Unpack inputs and outputs and run inference on a given layer.
   *
   * \param args Access inputs and outputs.
   * \param function The layer to execute inference on.
   * \return Status of inference.
   */
  void Run() override {
    for (size_t nid_idx = 0; nid_idx < input_nodes_.size(); ++nid_idx) {
      auto nid = input_nodes_[nid_idx];
      if (nodes_[nid].GetOpType() == "input") {
        for (uint32_t eid_idx = 0; eid_idx < nodes_[nid].GetNumOutput(); eid_idx++) {
          uint32_t eid = EntryID(nid, eid_idx);
          void* data = data_entry_[eid]->data;
          auto key = std::pair<uint32_t, uint32_t>(nid, eid_idx);
          if (layer_.json_inputid_to_layer_inputid.count(key) > 0) {
            CheckACLError(
                layer_.inputs[layer_.json_inputid_to_layer_inputid[key]].allocator()->import_memory(
                    data));
          } else {
            CheckACLError(layer_.inputs[nid_idx].allocator()->import_memory(data));
          }
        }
      }
    }

    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);
      void* data = data_entry_[eid]->data;
      CheckACLError(layer_.outputs[i].allocator()->import_memory(data));
    }

    this->layer_.function->run();
  }

 private:
  /*!
   * \brief Build ACL layer from JSON representation and cache.
   *
   * \note For the time being only one layer or operator is supported
   * per engine.
   */
  void BuildEngine() {
    std::shared_ptr<arm_compute::MemoryManagerOnDemand> mm = MakeACLMemoryManager();
    int num_pools = 0;
    bool found_kernel_node = false;
    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (found_kernel_node) {
        LOG(FATAL)
            << "Arm Compute Library runtime module only supports one kernel node per function.";
      }
      if (node.GetOpType() == "kernel") {
        found_kernel_node = true;
        auto op_name = node.GetOpName();
        if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) {
          CreateConvolution2DLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) {
          CreateDepthwiseConvolution2DLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
          CreateFullyConnectedLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
                   "nn.l2_pool2d" == op_name) {
          CreatePoolingLayer(&layer_, node);
        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
          CreateGlobalPoolingLayer(&layer_, node);
        } else if ("reshape" == op_name) {
          CreateReshapeLayer(&layer_, node);
        } else if ("maximum" == op_name) {
          CreateMaximumLayer(&layer_, node);
        } else if ("add" == op_name || "qnn.add" == op_name) {
          CreateAddLayer(&layer_, node);
        } else if ("concatenate" == op_name) {
          CreateConcatenateLayer(&layer_, node);
        } else {
          LOG(FATAL) << "Unsupported op: " << op_name;
        }
      }
    }
    this->layer_.function->prepare();
    if (num_pools > 0) mm->populate(this->allocator_, num_pools);
  }

  /*!
   * \brief ACL objects we cache in order to avoid needing to construct
   * a new layer each time.
   */
  struct CachedLayer {
    // The configured ACL function; prepared in BuildEngine() and executed by Run().
    std::shared_ptr<arm_compute::IFunction> function;
    // ACL input tensors, in the order the Create*Layer helper pushed them.
    std::vector<arm_compute::Tensor> inputs;
    // ACL output tensors, in the order the Create*Layer helper pushed them.
    std::vector<arm_compute::Tensor> outputs;
    // Maps a JSON (node id, output index) pair to the index of the matching entry in
    // `inputs`. This is optional, i.e. it is only filled in when an operator addresses
    // its inputs by entry index.
    std::map<std::pair<uint32_t, uint32_t>, uint32_t> json_inputid_to_layer_inputid;
  };

  /*!
   * \brief Create an ACL tensor given the JSON representation. If scale
   * and offset are given, then create a quantized ACL tensor.
   *
   * Looks up the node the entry refers to and, when that node is a constant, passes the
   * pointer of its data entry along so the tensor can be created over the constant data.
   *
   * \param tensor The tensor to represent.
   * \param scale (optional) The scale of the tensor as an input.
   * \param offset (optional) The offset of the tensor as an input.
   * \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
   * setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
   * _num_dimensions should be 3 rather than 1.
   * \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
   * dimensions of the shape.
   * \return ACL Tensor.
   */
  arm_compute::Tensor MakeACLTensorFromJSONEntry(const JSONGraphNodeEntry& tensor,
                                                 JSONGraphNodeEntry* scale = nullptr,
                                                 JSONGraphNodeEntry* offset = nullptr,
                                                 bool apply_dim_correction = true,
                                                 bool increase_dim_unit = true) {
    // Bind by reference: the node is only read, so copying it would be wasted work.
    const JSONGraphNode& node = nodes_[tensor.id_];
    void* node_data = nullptr;
    if (node.GetOpType() == "const") {
      node_data = data_entry_[EntryID(tensor)]->data;
    }
    return MakeACLTensorFromJSONNode(node, scale, offset, node_data, apply_dim_correction,
                                     increase_dim_unit, tensor.index_);
  }

  /*!
   * \brief Create an ACL tensor given the JSON representation. If scale
   * and offset are given, then create a quantized ACL tensor.
   *
   * The quantization data is only looked up when BOTH scale and offset are non-null;
   * supplying just one of them is treated the same as supplying neither.
   *
   * \param node The tensor to represent.
   * \param scale (optional) The scale of the tensor as an input.
   * \param offset (optional) The offset of the tensor as an input.
   * \param data (optional) Constant data of input node.
   * \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
   * setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
   * _num_dimensions should be 3 rather than 1.
   * \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
   * dimensions of the shape.
   * \param entry_index The entry index.
   * \return ACL Tensor.
   */
  arm_compute::Tensor MakeACLTensorFromJSONNode(
      const JSONGraphNode& node, JSONGraphNodeEntry* scale = nullptr,
      JSONGraphNodeEntry* offset = nullptr, void* data = nullptr, bool apply_dim_correction = true,
      bool increase_dim_unit = true, uint32_t entry_index = 0) {
    const bool has_quant_params = (scale != nullptr) && (offset != nullptr);
    const DLTensor* scale_data = has_quant_params ? data_entry_[EntryID(*scale)] : nullptr;
    const DLTensor* offset_data = has_quant_params ? data_entry_[EntryID(*offset)] : nullptr;
    return MakeACLTensor(node, data, scale_data, offset_data, apply_dim_correction,
                         increase_dim_unit, entry_index);
  }

  /*!
   * \brief Create a 2D convolution layer.
   *
   * Handles both nn.conv2d and qnn.conv2d. In the quantized form the extra scale and
   * offset entries are attached to the data, weights and output tensors instead of being
   * registered as layer inputs of their own.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
   */
  void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
                                const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    using StrVec = std::vector<std::string>;

    const StrVec padding = node.GetAttr<StrVec>("padding");
    const StrVec strides = node.GetAttr<StrVec>("strides");
    const StrVec dilation = node.GetAttr<StrVec>("dilation");
    const arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);

    const int groups = std::stoi(node.GetAttr<StrVec>("groups")[0]);
    ICHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1.";

    // Stays default-constructed unless the node carries an activation attribute.
    arm_compute::ActivationLayerInfo act_info;
    if (node.HasAttr("activation_type")) {
      act_info = MakeACLActivationInfo(node.GetAttr<StrVec>("activation_type")[0]);
    }

    // NOTE(review): the dilation values are forwarded in attribute order. ACL documents
    // Size2D as (width, height) - confirm the ordering for non-square dilation.
    const arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));

    // Gather the layer tensors; the input layout differs between nn.conv2d and qnn.conv2d.
    // The copy is mutable because scale/offset entries are passed by non-const pointer.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    const size_t num_inputs = inputs.size();
    bool has_bias = false;
    if (node.GetOpName() == "qnn.conv2d") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
      has_bias = (num_inputs == 9);
      // The output's scale and offset entries come after the optional bias entry.
      const size_t out_scale_idx = has_bias ? 7 : 6;
      const size_t out_offset_idx = out_scale_idx + 1;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[out_scale_idx], &inputs[out_offset_idx]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
      has_bias = (num_inputs == 3);
      for (size_t idx = 0; idx < num_inputs; ++idx) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[idx]));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    // Tensor addresses are taken only now, after the last push_back into the vectors.
    arm_compute::Tensor* bias = has_bias ? &layer->inputs[2] : nullptr;
    auto conv = std::make_shared<arm_compute::NEConvolutionLayer>(mm);
    conv->configure(&layer->inputs[0], &layer->inputs[1], bias, &layer->outputs[0],
                    pad_stride_info, arm_compute::WeightsInfo(), dilation_2d, act_info);
    layer->function = conv;
  }

  /*!
   * \brief Create a 2D depthwise convolution layer.
   *
   * Handles both nn.depthwise_conv2d and qnn.depthwise_conv2d. The depth multiplier is
   * read from dimension 3 of the ACL weights tensor shape.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
   */
  void CreateDepthwiseConvolution2DLayer(
      CachedLayer* layer, const JSONGraphNode& node,
      const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    using StrVec = std::vector<std::string>;

    const StrVec padding = node.GetAttr<StrVec>("padding");
    const StrVec strides = node.GetAttr<StrVec>("strides");
    const StrVec dilation = node.GetAttr<StrVec>("dilation");
    const arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);

    // Stays default-constructed unless the node carries an activation attribute.
    arm_compute::ActivationLayerInfo act_info;
    if (node.HasAttr("activation_type")) {
      act_info = MakeACLActivationInfo(node.GetAttr<StrVec>("activation_type")[0]);
    }

    // NOTE(review): the dilation values are forwarded in attribute order. ACL documents
    // Size2D as (width, height) - confirm the ordering for non-square dilation.
    const arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));

    // Gather the layer tensors; the input layout differs between nn.depthwise_conv2d and
    // qnn.depthwise_conv2d. The copy is mutable because scale/offset entries are passed by
    // non-const pointer.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    const size_t num_inputs = inputs.size();
    bool has_bias = false;
    if (node.GetOpName() == "qnn.depthwise_conv2d") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
      has_bias = (num_inputs == 9);
      // The output's scale and offset entries come after the optional bias entry.
      const size_t out_scale_idx = has_bias ? 7 : 6;
      const size_t out_offset_idx = out_scale_idx + 1;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[out_scale_idx], &inputs[out_offset_idx]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
      has_bias = (num_inputs == 3);
      for (size_t idx = 0; idx < num_inputs; ++idx) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[idx]));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    // Depth multiplier is the final dimension in acl weights tensor (IWH*M*)
    int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3];

    // Tensor addresses are taken only now, after the last push_back into the vectors.
    arm_compute::Tensor* bias = has_bias ? &layer->inputs[2] : nullptr;
    auto depthwise = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>(mm);
    depthwise->configure(&layer->inputs[0], &layer->inputs[1], bias, &layer->outputs[0],
                         pad_stride_info, depth_multiplier, act_info, dilation_2d);
    layer->function = depthwise;
  }

  /*!
   * \brief Create a fully connected (dense) layer.
   *
   * Handles both nn.dense and qnn.dense. The weights are declared to ACL as having been
   * trained in NHWC layout.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL fully connected layer can request auxiliary memory from TVM.
   */
  void CreateFullyConnectedLayer(CachedLayer* layer, const JSONGraphNode& node,
                                 const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    arm_compute::FullyConnectedLayerInfo fc_info;
    fc_info.set_weights_trained_layout(arm_compute::DataLayout::NHWC);

    // Gather the layer tensors; the input layout differs between nn.dense and qnn.dense.
    // The copy is mutable because scale/offset entries are passed by non-const pointer.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    const size_t num_inputs = inputs.size();
    bool has_bias = false;
    if (node.GetOpName() == "qnn.dense") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized fully connected (dense) layer requires 9 inputs with a bias, 8 inputs "
             "without.";
      has_bias = (num_inputs == 9);
      // The output's scale and offset entries come after the optional bias entry.
      const size_t out_scale_idx = has_bias ? 7 : 6;
      const size_t out_offset_idx = out_scale_idx + 1;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[out_scale_idx], &inputs[out_offset_idx]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Fully connected (dense) layer requires 3 inputs with a bias, 2 inputs without.";
      has_bias = (num_inputs == 3);
      for (size_t idx = 0; idx < num_inputs; ++idx) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[idx]));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    // Tensor addresses are taken only now, after the last push_back into the vectors.
    arm_compute::Tensor* bias = has_bias ? &layer->inputs[2] : nullptr;
    auto dense = std::make_shared<arm_compute::NEFullyConnectedLayer>(mm);
    dense->configure(&layer->inputs[0], &layer->inputs[1], bias, &layer->outputs[0], fc_info);
    layer->function = dense;
  }

  /*!
   * \brief Create a pooling layer.
   *
   * \note Currently max_pool2d, avg_pool2d and L2 pooling are supported. Any dilation
   * other than (1, 1) is rejected.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
    using StrVec = std::vector<std::string>;

    const StrVec padding = node.GetAttr<StrVec>("padding");
    const StrVec strides = node.GetAttr<StrVec>("strides");
    const StrVec dilation = node.GetAttr<StrVec>("dilation");
    const bool ceil_mode = std::stoi(node.GetAttr<StrVec>("ceil_mode")[0]) != 0;
    const arm_compute::PadStrideInfo pad_stride_info =
        MakeACLPadStride(padding, strides, ceil_mode);

    const StrVec attr_pool_size = node.GetAttr<StrVec>("pool_size");
    const int pool_size_h = std::stoi(attr_pool_size[0]);
    const int pool_size_w = std::stoi(attr_pool_size[1]);

    // Only applies to average pool and l2 pool.
    // ACL's exclude-pad option is the inverse of Relay's include-pad option.
    bool exclude_pad = false;
    if (node.HasAttr("count_include_pad")) {
      const int count_include_pad = std::stoi(node.GetAttr<StrVec>("count_include_pad")[0]);
      exclude_pad = (count_include_pad == 0);
    }

    const std::string op_name = node.GetOpName();
    arm_compute::PoolingType pool_type;
    if (op_name == "nn.max_pool2d") {
      pool_type = arm_compute::PoolingType::MAX;
    } else if (op_name == "nn.avg_pool2d") {
      pool_type = arm_compute::PoolingType::AVG;
    } else if (op_name == "nn.l2_pool2d") {
      pool_type = arm_compute::PoolingType::L2;
    } else {
      LOG(FATAL) << "Pooling type not supported";
    }

    ICHECK(dilation.size() == 2 && dilation[0] == "1" && dilation[1] == "1")
        << "Dilation other than (1, 1) not supported";

    // NOTE(review): the height is passed first here. ACL documents Size2D as
    // (width, height) - confirm the ordering for non-square pooling windows.
    const arm_compute::Size2D pool_size(pool_size_h, pool_size_w);
    const arm_compute::PoolingLayerInfo pool_info(pool_type, pool_size,
                                                  arm_compute::DataLayout::NHWC, pad_stride_info,
                                                  exclude_pad);

    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    auto pooling = std::make_shared<arm_compute::NEPoolingLayer>();
    pooling->configure(&layer->inputs[0], &layer->outputs[0], pool_info);
    layer->function = pooling;
  }

  /*!
   * \brief Create a global pooling layer.
   *
   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
    const std::string op_name = node.GetOpName();
    arm_compute::PoolingType pool_type;
    if (op_name == "nn.global_max_pool2d") {
      pool_type = arm_compute::PoolingType::MAX;
    } else if (op_name == "nn.global_avg_pool2d") {
      pool_type = arm_compute::PoolingType::AVG;
    } else {
      LOG(FATAL) << "Pooling type not supported";
    }

    // No pool size is given; this PoolingLayerInfo is built from type and layout only.
    const arm_compute::PoolingLayerInfo pool_info(pool_type, arm_compute::DataLayout::NHWC);

    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    auto pooling = std::make_shared<arm_compute::NEPoolingLayer>();
    pooling->configure(&layer->inputs[0], &layer->outputs[0], pool_info);
    layer->function = pooling;
  }

  /*!
   * \brief Create a reshape layer.
   *
   * Uses the node's first input as the source tensor and the node itself as the
   * description of the destination tensor.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node) {
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    arm_compute::Tensor* src = &layer->inputs[0];
    arm_compute::Tensor* dst = &layer->outputs[0];
    auto reshape = std::make_shared<arm_compute::NEReshapeLayer>();
    reshape->configure(src, dst);
    layer->function = reshape;
  }

  /*!
   * \brief Create a maximum layer.
   *
   * Configures an element-wise maximum over the node's first two inputs.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateMaximumLayer(CachedLayer* layer, const JSONGraphNode& node) {
    const auto operands = node.GetInputs();
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(operands[0]));
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(operands[1]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    // Tensor addresses are taken only now, after the last push_back into the vectors.
    auto elementwise_max = std::make_shared<arm_compute::NEElementwiseMax>();
    elementwise_max->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]);
    layer->function = elementwise_max;
  }
  /*!
   * \brief Creates an add/qnn.add layer
   *
   * For "add" the first two inputs are the operands. For "qnn.add" the operands are
   * followed by six quantization entries: inputs 2/3 are passed as scale/offset of the
   * first operand, 4/5 of the second operand and 6/7 of the output. The number of inputs
   * is checked before any entry is accessed.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node  The JSON representation of the operator.
   */
  void CreateAddLayer(CachedLayer* layer, const JSONGraphNode& node) {
    auto op_name = node.GetOpName();
    // One mutable copy of the entries: the quantization parameters are passed by non-const
    // pointer, and a single copy avoids taking addresses inside repeated temporaries.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    const size_t num_inputs = inputs.size();
    if ("add" == op_name) {
      ICHECK(num_inputs >= 2U) << "Add requires 2 inputs, but got " << num_inputs << ".";
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1]));
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    } else if ("qnn.add" == op_name) {
      ICHECK(num_inputs >= 8U) << "Quantized add requires 8 inputs, but got " << num_inputs
                               << ".";
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[2], &inputs[3]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[4], &inputs[5]));
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node, &inputs[6], &inputs[7]));
    } else {
      LOG(FATAL) << "Unsupported form of add op: " + op_name;
    }

    auto f = std::make_shared<arm_compute::NEArithmeticAddition>();

    // SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 always saturates result
    f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0],
                 arm_compute::ConvertPolicy::SATURATE);
    layer->function = f;
  }

  /*!
   * \brief Create a Concatenate layer.
   *
   * Every input entry becomes one ACL input tensor (created without dimension correction)
   * and is recorded in json_inputid_to_layer_inputid under its (node id, entry index) pair.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateConcatenateLayer(CachedLayer* layer, const JSONGraphNode& node) {
    const std::vector<std::string> axis = node.GetAttr<std::vector<std::string>>("axis");

    const auto entries = node.GetInputs();
    for (const auto& entry : entries) {
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(entry, nullptr, nullptr, false));
      const std::pair<uint32_t, uint32_t> key(entry.id_, entry.index_);
      layer->json_inputid_to_layer_inputid[key] = layer->inputs.size() - 1;
    }

    // Collect the addresses only after the vector has stopped growing, so they stay valid.
    std::vector<const arm_compute::ITensor*> inputs;
    for (const auto& acl_input : layer->inputs) {
      inputs.push_back(&acl_input);
    }

    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    const int dimNum = layer->inputs[0].info()->num_dimensions();
    auto concat = std::make_shared<arm_compute::NEConcatenateLayer>();

    // The shape of a tensor is reversed when it is handed to ACL: [1, 2, 3, 4] becomes
    // [4, 3, 2, 1] on the ACL side, so the axis has to be mirrored as well. A negative
    // axis already counts from the end and therefore maps to -axis - 1.
    const int relay_axis = std::stoi(axis[0]);
    int acl_axis;
    if (relay_axis < 0) {
      acl_axis = -relay_axis - 1;
    } else {
      acl_axis = dimNum - relay_axis - 1;
    }
    concat->configure(inputs, &layer->outputs[0], acl_axis);
    layer->function = concat;
  }

  /*!
   * \brief Allow ACL functions to request auxiliary memory from TVM.
   * \note Handed to the ACL memory manager's populate() call in BuildEngine().
   */
  ACLAllocator allocator_;
  /*!
   * \brief The network layer represented by ACL functions: the configured function
   * together with its input and output tensors.
   * \note Currently only supports a single layer.
   */
  CachedLayer layer_;
#else
  /*!
   * \brief Stand-in used when TVM is built without the ACL graph executor.
   *
   * Always raises a fatal log; inference is not possible in this configuration.
   */
  void Run() override {
    LOG(FATAL)
        << "Cannot call run on Arm Compute Library module without runtime enabled. "
        << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR.";
  }

  /*!
   * \brief Stand-in used when TVM is built without the ACL graph executor.
   *
   * Builds nothing; it only logs a warning so that Init() can still complete.
   */
  void BuildEngine() {
    LOG(WARNING)
        << "Arm Compute Library engine is not initialized. "
        << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR.";
  }
#endif
};
/*!
 * \brief Create an ACLRuntime and return it wrapped in an ffi::Module.
 *
 * \param symbol_name The name of the function.
 * \param graph_json Serialized JSON representation of a sub-graph.
 * \param const_names The names of each constant in the sub-graph.
 * \return The module holding the newly created runtime.
 */
ffi::Module ACLRuntimeCreate(const ffi::String& symbol_name, const ffi::String& graph_json,
                             const ffi::Array<ffi::String>& const_names) {
  return ffi::Module(ffi::make_object<ACLRuntime>(symbol_name, graph_json, const_names));
}

// Registers the two global functions of this runtime: the factory used to create a
// module, and the loader used to restore a module of kind "arm_compute_lib" from bytes.
TVM_FFI_STATIC_INIT_BLOCK() {
  auto registry = tvm::ffi::reflection::GlobalDef();
  registry.def("runtime.arm_compute_lib_runtime_create", ACLRuntimeCreate)
      .def("ffi.Module.load_from_bytes.arm_compute_lib",
           JSONRuntimeBase::LoadFromBytes<ACLRuntime>);
}
}  //  namespace contrib
}  //  namespace runtime
}  //  namespace tvm
